import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# Load the raw mobile-price dataset and take a first look at it.
df = pd.read_csv("data/data.csv")
df.head()
df.shape
df.info()
# Pairwise scatter plots (off-diagonal) and histograms (diagonal) of all numeric columns.
plt.figure(figsize=(10,10))
sns.pairplot(data=df)
For further automated analysis of the data, you can use an auto-EDA library such as Pandas Profiling.
pip install pandas-profiling
#importing required packages
import pandas as pd
import pandas_profiling
import numpy as np
#descriptive statistics
# Build an automated EDA report with pandas-profiling and save it to HTML.
# NOTE(review): pandas-profiling was renamed ydata-profiling upstream —
# confirm the installed version still exposes this API.
profile = pandas_profiling.ProfileReport(df)
profile
profile.to_file("EDA_report.html")
df.describe()
We can see the Unnamed: 0 column is useless because it carries no meaningful information — it is just an index column. Therefore we can remove it from the dataset.
# Drop the redundant row-index column that was exported with the CSV.
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.columns
To check the number of missing values in the dataset,
# Count missing values per column.
df.isnull().sum()
# Visualize missingness: coloured cells in the heatmap mark missing entries.
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
plt.show()
We can see the Ratings, RAM, ROM, Mobile_Size and Selfi_Cam variables have missing values, and we need to treat them first.
Here we will apply the techniques below to deal with missing values, and among them we will find the most appropriate missing-value technique for each variable.
When the data follow a normal distribution, mean-value replacement is the best method; but when the data are skewed, it is better to use median-value replacement.
missing_var = ['Ratings', 'RAM', 'ROM', 'Mobile_Size', 'Selfi_Cam']
Put all the variables with missing values into another dataframe and name it df_missing.
# Separate frame holding only the columns with missing values.
# NOTE(review): selecting with a column list returns a copy, so the
# imputations below do not write back to df — confirm that is intended.
df_missing= df[['Ratings', 'RAM', 'ROM', 'Mobile_Size', 'Selfi_Cam']]
df_missing.head()
Look at the distributions of the variables that have missing values.
# Inspect the distributions of the to-be-imputed variables (diagonal = histograms).
plt.figure(figsize=(5,5))
sns.pairplot(data=df_missing)
By looking at the diagonal of the above graph (histograms), we see all the variables are skewed. Therefore we will use the median imputation technique.
# If a variable has missing values, add a "<variable>_median" column in
# which those missing values are replaced by the variable's median.
# Fix: the notebook export flattened the function body to top level;
# indentation is restored here so the def is valid Python.
def impute_nan(df, variable):
    """Create `variable + "_median"` on *df* in place: a copy of the
    column with NaNs filled by the column median (original column is
    left untouched).
    """
    df[variable + "_median"] = df[variable].fillna(df[variable].median())
# run the above function for all the missing-value variables
# NOTE(review): loop-body indentation was lost in the notebook export —
# the impute_nan call should be indented under the for.
for i in missing_var:
impute_nan(df_missing,i)
# Confirm the new *_median columns exist and contain no missing values.
df_missing.head()
df_missing.isnull().sum()
# If a variable has missing values, add a "<variable>_random" column in
# which each missing value is replaced by a value drawn at random
# (random_state=0, reproducible) from the observed values of that variable.
def impute_nan_random(df, variable):
    """Create `variable + "_random"` on *df* in place with NaNs filled by
    random draws from the non-missing values.

    Fixes relative to the original: the function body was flattened by the
    notebook export, and `sample()` raised ValueError whenever the number
    of missing values exceeded the number of observed values — we now
    sample with replacement only in that case, so the previously working
    behaviour (including the exact draw) is unchanged.
    """
    df[variable + "_random"] = df[variable]
    observed = df[variable].dropna()
    n_missing = df[variable].isnull().sum()
    ## draw the random sample used to fill the NaNs
    random_sample = observed.sample(
        n_missing, replace=n_missing > len(observed), random_state=0
    )
    ## pandas aligns on index when assigning, so give the sample the
    ## index of the missing rows before slotting it in
    random_sample.index = df[df[variable].isnull()].index
    df.loc[df[variable].isnull(), variable + '_random'] = random_sample
# Apply random-sample imputation to every variable with missing values.
# NOTE(review): loop-body indentation was lost in the notebook export.
for i in missing_var:
impute_nan_random(df_missing,i)
df_missing.head()
df_missing.isnull().sum()
#Comparison of the distributions before and after dealing with missing values
# Fix: the notebook export flattened the function body to top level;
# indentation is restored here so the def is valid Python.
def comparison_plot(df, variable):
    """Overlay kernel-density estimates of `variable` (default colour),
    its median-imputed column (red) and its random-imputed column (green)
    on one axis, so the distortion introduced by each imputation
    technique can be compared visually.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    df[variable].plot(kind='kde', ax=ax)
    df[variable + "_median"].plot(kind='kde', ax=ax, color='red')
    df[variable + "_random"].plot(kind='kde', ax=ax, color='green')
    lines, labels = ax.get_legend_handles_labels()
    ax.legend(lines, labels, loc='best')
# Compare distributions before/after imputation for every imputed variable.
# NOTE(review): loop-body indentation was lost in the notebook export.
for i in missing_var:
comparison_plot(df_missing,i)
df.columns[0]
#Compare the standard deviations before and after imputation
# Fixes: the notebook export flattened the function body to top level
# (restored), and the printed text misspelled "replacement".
def compare_std(df, variable):
    """Print the standard deviation of `variable` and of its median- and
    random-imputed companion columns — a good imputation technique
    barely changes the standard deviation.
    """
    print("original std of ", variable, "is ", df[variable].std(),
          "and after median replacement std: ", df[variable + "_median"].std(),
          "and after random number replacement std: ", df[variable + "_random"].std())
# Compare standard deviations before/after imputation for each variable.
# NOTE(review): loop-body indentation was lost in the notebook export.
for i in missing_var:
compare_std(df_missing,i)
By looking at the kernel density plots and the standard deviation values, we can conclude that random-number imputation is the best method, as it does not distort the original distribution's behaviour.
#Combine df_missing and df dataframes
df_missing.head()
df_missing.columns
# Keep only the random-imputation columns (chosen above as the best
# technique) and join them back onto the original frame by index.
df_new = pd.concat([df, df_missing[['Ratings_random', 'RAM_random', 'ROM_random', 'Mobile_Size_random',
'Selfi_Cam_random']]], axis = 1)
df_new.head()
#remove the original (still-missing) versions of the imputed columns
df_new.drop(['Ratings', 'RAM', 'ROM', 'Mobile_Size','Selfi_Cam'], axis = 1, inplace = True)
df_new.head()
df_new.isnull().sum()
Now we have a clean dataset
# `Brand me` holds the full product name; keep only the first token (the brand).
df_new['Brand me'].dtype
# NOTE(review): in pandas 2.x `n` is keyword-only — str.split(' ', n=1);
# confirm the installed pandas version accepts the positional form.
df_new['Brand me'] = df_new['Brand me'].str.split(' ',1).str[0]
df_new['Brand me'].value_counts()
df_new.head()
import seaborn as sns
# Bar chart of product counts per brand, most frequent first.
plt.figure(figsize=(15,10))
sns.set_style('whitegrid')
sns.countplot(x='Brand me',
data=df_new,
order = df_new['Brand me'].value_counts().index)
plt.xticks(rotation=75)
plt.show()
In this chart we can see there are 71 mobile brands in this dataset, and only a few of them have more than 20 products. Therefore we select only the top 10 brands and group the remaining ones into an 'Other' category.
#Get the list of the top 10 brands by product count
top10 = df_new['Brand me'].value_counts().index[:10]
top10
# Collapse every brand outside the top 10 into a single 'Other' category.
df_new['Brand me'] = np.where(df_new['Brand me'].isin(top10), df_new['Brand me'], 'Other')
df_new['Brand me'].nunique()
df_new['Brand me'].value_counts()
Now we have a categorical feature and it has 11 categories. Therefore we need to encode them.
# Mean price per brand, highest first — used to rank the encoding below.
brand_list = df_new['Price'].groupby(df_new['Brand me']).mean().sort_values(ascending=False)
brand_list
We can see there is a relationship between price and brand. Therefore we will use target-guided ordinal encoding (ranking brands by mean price) to encode the Brand me variable.
#get the list of brand names in mean-price order
brand_list.index
brand_list.nunique()
#First convert to Categorical
df_new['Brand me'] = df_new['Brand me'].astype('category')
# Target-guided ordinal encoding: each brand is mapped to an integer
# ranked by its mean price (highest mean price -> largest code).
# NOTE(review): this mapping is hard-coded from one run of `brand_list`;
# if the data changes, the ranking may no longer match — verify against
# the brand_list output above.
df_new['Brand me'] = df_new['Brand me'].cat.rename_categories({'Apple': 11,
'Samsung': 10,
'Vivo': 9,
'OPPO': 8,
'Other': 7,
'Nokia': 6,
'Lava': 5,
'Micax': 4,
'Karbonn': 3,
'I': 2,
'Kechaoda':1
})
df_new['Brand me'].value_counts()
df_new.head()
df_new.shape
Understanding the correlation between the target and the other features.
# Pairwise Pearson correlations between all features (incl. the target).
corr = df_new.corr()
corr.shape
# Plotting the heatmap of correlation between features
plt.figure(figsize=(10,10))
sns.heatmap(corr, cbar=False, square= True, annot=True, cmap='Blues')
By looking at the above heatmap, there is no multicollinearity in the dataset. You can confirm this by using the function below.
# with the following function we can select highly correlated features;
# it flags the LATER column of each correlated pair, so dropping the
# returned names removes pairwise multicollinearity
# Fix: the notebook export flattened the function body to top level;
# indentation is restored here so the def is valid Python.
def correlation(dataset, threshold):
    """Return the set of column names whose absolute Pearson correlation
    with any preceding column exceeds *threshold*.
    """
    col_corr = set()  # names of columns flagged as highly correlated
    corr_matrix = dataset.corr()
    # Walk the lower triangle (j < i) so each pair is inspected once.
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  # absolute coefficient
                col_corr.add(corr_matrix.columns[i])
    return col_corr
# No feature pair exceeds |r| = 0.7, i.e. nothing needs to be dropped.
corr_features = correlation(df_new, 0.7)
len(set(corr_features))
This function also implies, there's no inter correlation between independent variables.
df_new.head()
df_new.columns
# Feature matrix X and continuous target y (mobile price).
X = df_new[['Brand me', 'Primary_Cam', 'Battery_Power', 'Ratings_random',
'RAM_random', 'ROM_random', 'Mobile_Size_random', 'Selfi_Cam_random']]
y = df_new['Price']
# Fitting feature selection using ensemble-model impurity importances.
# Fix: Price is a continuous target, so this is a regression problem —
# the original used ExtraTreesClassifier, which treats every distinct
# price as a separate class; ExtraTreesRegressor is the correct estimator.
from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt
model = ExtraTreesRegressor()
model.fit(X,y)
print(model.feature_importances_)
# Plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()
# Distribution and boxplot of ROM to spot outliers.
# NOTE(review): distplot is deprecated in newer seaborn — histplot/displot
# is the modern replacement; confirm the installed seaborn version.
sns.distplot(df_new['ROM_random'])
figure=df_new.boxplot(column="ROM_random")
df_new['ROM_random'].describe()
# Rows whose ROM exceeds 64 (the boxplot outliers) and their brands.
df_new.loc[df_new['ROM_random']>64, :]
outliers = df_new.loc[df_new['ROM_random']>64, :]
outliers['Brand me'].value_counts()
We can see that categories 7–11 have the outliers, which may occur because their average prices are higher than the others'; usually when prices increase, the feature values also increase. Given these stats, it would be unfair to replace the outliers with other values, so we leave them untouched, as they carry very useful information.
We can clearly see the scales of the variables are very different. Therefore we need to apply a transformation technique at the end.
#### standardisation: we use StandardScaler from the sklearn library.
# Fix: the original called scaler.fit_transform on the FULL dataset before
# the train/test split, leaking test-set statistics (mean/std) into
# training. Split first, fit the scaler on the training fold only, then
# apply the learned statistics to both folds.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
scaler=StandardScaler()
### fit vs fit_transform: fit_transform learns mean/std from the training
### data and scales it; transform reuses those statistics unchanged.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
# Baseline model: ordinary least-squares linear regression.
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(X_train, y_train)
# Make predictions using the testing set
y_pred_lr = regr.predict(X_test)
# Residual distribution — roughly centred on zero for an unbiased model.
sns.histplot(y_test-y_pred_lr)
plt.show()
# Visualizing the differences between actual prices and predicted values
plt.scatter(y_test, y_pred_lr)
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.title("Prices vs Predicted prices")
plt.show()
# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred_lr))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred_lr))
# Training Score — .score() here is R^2 * 100, not classification accuracy
print("Training Accuracy:",regr.score(X_train,y_train)*100)
# Testing Score
print("Testing Accuracy:",regr.score(X_test,y_test)*100)
We can see accuracies are really poor. Therefore linear regression is not a good model.
# Regularised linear model: ElasticNet (equal mix of L1 and L2 penalties).
from sklearn.linear_model import ElasticNet
EN_model = ElasticNet(alpha=1.0, l1_ratio=0.5)
EN_model.fit(X_train,y_train)
# Training Score (R^2 * 100)
print("Training Accuracy:",EN_model.score(X_train,y_train)*100)
# Testing Score
print("Testing Accuracy:",EN_model.score(X_test,y_test)*100)
# Extra-Trees model. Fixes relative to the original:
#  - Price is continuous, so use ExtraTreesRegressor (the original used
#    ExtraTreesClassifier, which treats each price as a class label);
#  - the normalised RMSE was computed from a hard-coded RMSE value copied
#    from one earlier run — compute it from the actual predictions instead.
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor()
model.fit(X_train,y_train)
y_pred_etc = model.predict(X_test)
# Residual distribution.
sns.histplot(y_test-y_pred_etc)
plt.show()
# Predicted vs actual prices.
plt.scatter(y_test, y_pred_etc, alpha = 0.5)
plt.xlabel("y_test")
plt.ylabel("y_pred")
plt.show()
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred_etc))
print('MSE:', metrics.mean_squared_error(y_test, y_pred_etc))
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred_etc))
print('RMSE:', rmse)
# Normalised RMSE: RMSE / (max(DV) - min(DV))
print('NRMSE:', rmse / (max(y) - min(y)))
# Training Score (R^2 * 100)
print("Training Accuracy:",model.score(X_train,y_train)*100)
# Testing Score
print("Testing Accuracy:",model.score(X_test,y_test)*100)
We can see there's an overfitting issue in this model as it has massive difference between training accuracy and testing accuracy.
# Random-forest regressor.
# Fix: the original created and fitted the model twice back-to-back
# (identical duplicated lines); train it once.
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor()
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)
# Visualizing the differences between actual prices and predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.title("Prices vs Predicted prices")
plt.show()
# Training Score (R^2 * 100)
print("Training Accuracy:",reg.score(X_train,y_train)*100)
# Testing Score
print("Testing Accuracy:",reg.score(X_test,y_test)*100)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
This is a good model. Therefore we will optimize this model to improve the performances.
# Support-vector regression with an RBF kernel.
# Fix: the original predicted with `reg` (the random forest from the
# previous section) instead of the SVR it had just trained — predict
# with the SVR itself.
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)
y_pred_svr = regressor.predict(X_test)
# Training Score (R^2 * 100)
print("Training Accuracy:",regressor.score(X_train,y_train)*100)
# Testing Score
print("Testing Accuracy:",regressor.score(X_test,y_test)*100)
# Hyper-parameter tuning of the random forest with randomized search.
from sklearn.model_selection import RandomizedSearchCV
#Randomized Search CV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
# NOTE(review): 'auto' was removed for forests in newer scikit-learn —
# confirm the installed version still accepts it.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf}
# Random search of parameters, using 3 fold cross validation
# NOTE(review): n_iter=10, so only 10 combinations are tried, not 100
# as the original comment claimed.
rf_random = RandomizedSearchCV(estimator = reg, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = 1)
rf_random.fit(X_train,y_train)
rf_random.best_params_
# Keep the refitted best estimator and its test-set predictions.
best_random_grid=rf_random.best_estimator_
prediction = best_random_grid.predict(X_test)
# Residual distribution of the tuned model.
plt.figure(figsize = (8,8))
# NOTE(review): distplot is deprecated in newer seaborn — histplot is the
# modern replacement; confirm the installed seaborn version.
sns.distplot(y_test-prediction)
plt.show()
# Predicted vs actual prices for the tuned model.
plt.figure(figsize = (8,8))
plt.scatter(y_test, prediction, alpha = 0.5)
plt.xlabel("y_test")
plt.ylabel("y_pred")
plt.show()
print('MAE:', metrics.mean_absolute_error(y_test, prediction))
print('MSE:', metrics.mean_squared_error(y_test, prediction))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, prediction)))
# Training Score (R^2 * 100)
print("Training Accuracy:",best_random_grid.score(X_train,y_train)*100)
# Testing Score
print("Testing Accuracy:",best_random_grid.score(X_test,y_test)*100)
# Fine-grained grid search around the best randomized-search parameters.
from sklearn.model_selection import GridSearchCV
# NOTE(review): the offsets below are applied blindly to the randomized
# search's best values — verify they cannot produce invalid settings
# (e.g. a non-positive n_estimators or max_depth).
param_grid = {
'n_estimators': [rf_random.best_params_['n_estimators']+50,
rf_random.best_params_['n_estimators']-50],
'min_samples_split': [rf_random.best_params_['min_samples_split']+1,
rf_random.best_params_['min_samples_split']+2,
rf_random.best_params_['min_samples_split']+3],
'min_samples_leaf': [rf_random.best_params_['min_samples_leaf']],
'max_features': [rf_random.best_params_['max_features']],
'max_depth': [rf_random.best_params_['max_depth'] - 2,
rf_random.best_params_['max_depth'] - 1,
rf_random.best_params_['max_depth'],
rf_random.best_params_['max_depth'] +1,
rf_random.best_params_['max_depth'] + 2]
}
print(param_grid)
#### Fit the grid_search to the data
grid_search=GridSearchCV(estimator=reg,param_grid=param_grid,cv=5,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)
grid_search.best_estimator_
best_grid=grid_search.best_estimator_
best_grid
# Training Score (R^2 * 100)
print("Training Accuracy:",best_grid.score(X_train,y_train)*100)
# Testing Score
print("Testing Accuracy:",best_grid.score(X_test,y_test)*100)
It reduced both accuracies. Therefore we will go with the randomized-search model as the best model.
import pickle
# Persist the chosen model (tuned random forest) and reload it to verify.
# Fix: the original left both file handles open; use context managers so
# the handles are closed (and the write flushed) deterministically.
with open('mobile_price_rg.pkl', 'wb') as file:
    # dump the fitted estimator to disk
    pickle.dump(best_random_grid, file)
#Load the model back and confirm it still scores the test set
with open('mobile_price_rg.pkl', 'rb') as model_file:
    reg_rand = pickle.load(model_file)
y_prediction = reg_rand.predict(X_test)
metrics.r2_score(y_test, y_prediction)